{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AWS S3 support\n",
    " * Streams in data that is needed \n",
    " * Useful if you spin down/up an AWS machine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-13T14:25:46.414271Z",
     "start_time": "2019-07-13T14:25:45.538770Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import vaex\n",
    "\n",
    "# Open the HDF5 file directly from its S3 location.\n",
    "# NOTE(review): the `anon=true` query option presumably selects unauthenticated access - confirm against the vaex docs.\n",
    "taxi_s3_url = 's3://vaex/taxi/yellow_taxi_2015_f32s.hdf5?anon=true'\n",
    "df = vaex.open(taxi_s3_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-13T14:25:51.268969Z",
     "start_time": "2019-07-13T14:25:51.197831Z"
    }
   },
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-13T14:26:05.626081Z",
     "start_time": "2019-07-13T14:26:05.221659Z"
    }
   },
   "outputs": [],
   "source": [
    "df.passenger_count.sum(progress=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remote dataframe\n",
    " * Data at server\n",
    " * State changes at client\n",
    " * Server is stateless\n",
    "    * but does some caching for optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-13T14:26:35.185949Z",
     "start_time": "2019-07-13T14:26:33.871336Z"
    }
   },
   "outputs": [],
   "source": [
    "# Read the access token from a local file; the context manager closes the handle right away.\n",
    "with open('token-STSci.txt') as token_file:\n",
    "    token = token_file.read().strip()\n",
    "# NOTE(review): the token travels in the URL over plain ws:// - consider wss:// if the server offers it.\n",
    "df = vaex.open(f'ws://ec2-18-222-183-211.us-east-2.compute.amazonaws.com:9000/gaia_ps1_nochunk?token_trusted={token}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-13T14:26:37.869718Z",
     "start_time": "2019-07-13T14:26:36.593555Z"
    }
   },
   "outputs": [],
   "source": [
    "df.plot('ra', 'dec', f='log')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-13T14:26:49.433166Z",
     "start_time": "2019-07-13T14:26:49.084163Z"
    }
   },
   "outputs": [],
   "source": [
    "np.deg2rad(df.ra)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# xarray support\n",
    " * binby instead of groupby"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-12T20:14:54.784887Z",
     "start_time": "2019-07-12T20:14:48.464825Z"
    }
   },
   "outputs": [],
   "source": [
    "import vaex\n",
    "\n",
    "# Load the taxi file, then apply dropna restricted to the listed coordinate columns.\n",
    "taxi_raw = vaex.open('/data/yellow_taxi_2009_2015_f32.hdf5')\n",
    "coordinate_columns = ['dropoff_latitude', 'dropoff_longitude', 'pickup_latitude']\n",
    "df = taxi_raw.dropna(column_names=coordinate_columns)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-11T22:34:51.365960Z",
     "start_time": "2019-07-11T22:34:34.556457Z"
    }
   },
   "outputs": [],
   "source": [
    "# Lower-cased raw payment strings -> numeric payment code, grouped by code.\n",
    "map_payment_type = {\n",
    "    '1': 1, 'crd': 1, 'cre': 1, 'credit': 1,\n",
    "    '2': 2, 'cas': 2, 'csh': 2, 'cash': 2,\n",
    "    '3': 3, 'no ': 3, 'noc': 3, 'no charge': 3,\n",
    "    '4': 4, 'dis': 4, 'dispute': 4,\n",
    "    '5': 5, 'na ': 5, 'unk': 5,\n",
    "}\n",
    "\n",
    "# Label at index i belongs to code i + 1, because the codes are shifted down by one below.\n",
    "payment_labels = ['Credit card', 'Cash', 'No charge', 'Dispute', 'Unknown', 'Voided trip', 'NA']\n",
    "\n",
    "payment_lower = df.payment_type.str.lower()\n",
    "# Strings that are not in the dictionary get code 7, which lands on the last label ('NA') after the shift.\n",
    "payment_codes = payment_lower.map(map_payment_type, default_value=7, allow_missing=True)\n",
    "df['payment_type'] = payment_codes - 1\n",
    "df.categorize(df.payment_type, labels=payment_labels, check=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-11T22:35:46.180738Z",
     "start_time": "2019-07-11T22:34:51.367955Z"
    }
   },
   "outputs": [],
   "source": [
    "# Two binning axes: pickup month and payment type; each bin holds a count.\n",
    "pickup_month = vaex.groupby.BinnerTime.per_month(df.pickup_datetime)\n",
    "da = df.binby([pickup_month, df.payment_type], agg='count')\n",
    "da"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-07-11T22:35:54.767941Z",
     "start_time": "2019-07-11T22:35:54.246847Z"
    }
   },
   "outputs": [],
   "source": [
    "# pyplot is the supported interface; the pylab shim is discouraged by matplotlib.\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# Create the figure first so the plot call below draws into it.\n",
    "plt.figure(figsize=(10,6), dpi=200)\n",
    "# Plot log10 of the counts, split by payment type; the trailing ';' hides the return value.\n",
    "np.log10(da).plot(hue='payment_type');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
